-
Notifications
You must be signed in to change notification settings - Fork 11.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[mlir][gpu] Use `known_block_size` to set `maxntid` for NVVM target
#77301
Conversation
Setting the thread block size with `maxntid` on the kernel has great performance benefits: it lets the downstream PTX compiler do better register allocation. MLIR's `gpu.launch` and `gpu.launch_func` already have an attribute (`known_block_size`) that keeps the thread block size when it is known. This PR simply uses this attribute to set `maxntid`.
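@llvm/pr-subscribers-mlir-gpu Author: Guray Ozen (grypp) Changes: Setting the thread block size with `maxntid` on the kernel has great performance benefits: it lets the downstream PTX compiler do better register allocation. MLIR's `gpu.launch` and `gpu.launch_func` already have an attribute (`known_block_size`) that keeps the thread block size when it is known. This PR simply uses this attribute to set `maxntid`. Full diff: https://github.com/llvm/llvm-project/pull/77301.diff 4 Files Affected: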
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
index 6a005e67ca95ba..eeb8fbbb180bad 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -85,8 +85,26 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
// Add a dialect specific kernel attribute in addition to GPU kernel
// attribute. The former is necessary for further translation while the
// latter is expected by gpu.launch_func.
- if (gpuFuncOp.isKernel())
+ if (gpuFuncOp.isKernel()) {
attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr());
+
+ // Set the block size attribute if it is present.
+ if (kernelBlockSizeAttributeName.has_value()) {
+ std::optional<int32_t> dimX =
+ gpuFuncOp.getKnownBlockSize(gpu::Dimension::x);
+ std::optional<int32_t> dimY =
+ gpuFuncOp.getKnownBlockSize(gpu::Dimension::y);
+ std::optional<int32_t> dimZ =
+ gpuFuncOp.getKnownBlockSize(gpu::Dimension::z);
+ if (dimX.has_value() || dimY.has_value() || dimZ.has_value()) {
+ // If any of the dimensions are missing, fill them in with 1.
+ attributes.emplace_back(
+ kernelBlockSizeAttributeName.value(),
+ rewriter.getI32ArrayAttr(
+ {dimX.value_or(1), dimY.value_or(1), dimZ.value_or(1)}));
+ }
+ }
+ }
auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
LLVM::Linkage::External, /*dsoLocal=*/false, /*cconv=*/LLVM::CConv::C,
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
index a77db4a036bad3..471a688e85463e 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
@@ -36,13 +36,15 @@ struct GPUDynamicSharedMemoryOpLowering
};
struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
- GPUFuncOpLowering(const LLVMTypeConverter &converter,
- unsigned allocaAddrSpace, unsigned workgroupAddrSpace,
- StringAttr kernelAttributeName)
+ GPUFuncOpLowering(
+ const LLVMTypeConverter &converter, unsigned allocaAddrSpace,
+ unsigned workgroupAddrSpace, StringAttr kernelAttributeName,
+ std::optional<StringAttr> kernelBlockSizeAttributeName = std::nullopt)
: ConvertOpToLLVMPattern<gpu::GPUFuncOp>(converter),
allocaAddrSpace(allocaAddrSpace),
workgroupAddrSpace(workgroupAddrSpace),
- kernelAttributeName(kernelAttributeName) {}
+ kernelAttributeName(kernelAttributeName),
+ kernelBlockSizeAttributeName(kernelBlockSizeAttributeName) {}
LogicalResult
matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
@@ -56,6 +58,9 @@ struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
/// The attribute name to use instead of `gpu.kernel`.
StringAttr kernelAttributeName;
+
+ /// The attribute name to to set block size
+ std::optional<StringAttr> kernelBlockSizeAttributeName;
};
/// The lowering of gpu.printf to a call to HIP hostcalls
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index e60fe5cbd7603f..a7ac2332961ae2 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -352,7 +352,9 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
/*workgroupAddrSpace=*/
static_cast<unsigned>(NVVM::NVVMMemorySpace::kSharedMemorySpace),
StringAttr::get(&converter.getContext(),
- NVVM::NVVMDialect::getKernelFuncAttrName()));
+ NVVM::NVVMDialect::getKernelFuncAttrName()),
+ StringAttr::get(&converter.getContext(),
+ NVVM::NVVMDialect::getMaxntidAttrName()));
populateOpPatterns<math::AbsFOp>(converter, patterns, "__nv_fabsf",
"__nv_fabs");
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index 20a200e812c125..c7f1d4f124c186 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -627,6 +627,15 @@ gpu.module @test_module_31 {
}
}
+gpu.module @gpumodule {
+// CHECK-LABEL: func @kernel_with_block_size()
+// CHECK: attributes {gpu.kernel, gpu.known_block_size = array<i32: 128, 1, 1>, nvvm.kernel, nvvm.maxntid = [128 : i32, 1 : i32, 1 : i32]}
+ gpu.func @kernel_with_block_size() kernel attributes {gpu.known_block_size = array<i32: 128, 1, 1>} {
+ gpu.return
+ }
+}
+
+
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%toplevel_module: !transform.any_op {transform.readonly}) {
%gpu_module = transform.structured.match ops{["gpu.module"]} in %toplevel_module
|
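@llvm/pr-subscribers-mlir Author: Guray Ozen (grypp) Changes: Setting the thread block size with `maxntid` on the kernel has great performance benefits: it lets the downstream PTX compiler do better register allocation. MLIR's `gpu.launch` and `gpu.launch_func` already have an attribute (`known_block_size`) that keeps the thread block size when it is known. This PR simply uses this attribute to set `maxntid`. Full diff: https://github.com/llvm/llvm-project/pull/77301.diff 4 Files Affected: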
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
index 6a005e67ca95ba..eeb8fbbb180bad 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -85,8 +85,26 @@ GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
// Add a dialect specific kernel attribute in addition to GPU kernel
// attribute. The former is necessary for further translation while the
// latter is expected by gpu.launch_func.
- if (gpuFuncOp.isKernel())
+ if (gpuFuncOp.isKernel()) {
attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr());
+
+ // Set the block size attribute if it is present.
+ if (kernelBlockSizeAttributeName.has_value()) {
+ std::optional<int32_t> dimX =
+ gpuFuncOp.getKnownBlockSize(gpu::Dimension::x);
+ std::optional<int32_t> dimY =
+ gpuFuncOp.getKnownBlockSize(gpu::Dimension::y);
+ std::optional<int32_t> dimZ =
+ gpuFuncOp.getKnownBlockSize(gpu::Dimension::z);
+ if (dimX.has_value() || dimY.has_value() || dimZ.has_value()) {
+ // If any of the dimensions are missing, fill them in with 1.
+ attributes.emplace_back(
+ kernelBlockSizeAttributeName.value(),
+ rewriter.getI32ArrayAttr(
+ {dimX.value_or(1), dimY.value_or(1), dimZ.value_or(1)}));
+ }
+ }
+ }
auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
LLVM::Linkage::External, /*dsoLocal=*/false, /*cconv=*/LLVM::CConv::C,
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
index a77db4a036bad3..471a688e85463e 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
@@ -36,13 +36,15 @@ struct GPUDynamicSharedMemoryOpLowering
};
struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
- GPUFuncOpLowering(const LLVMTypeConverter &converter,
- unsigned allocaAddrSpace, unsigned workgroupAddrSpace,
- StringAttr kernelAttributeName)
+ GPUFuncOpLowering(
+ const LLVMTypeConverter &converter, unsigned allocaAddrSpace,
+ unsigned workgroupAddrSpace, StringAttr kernelAttributeName,
+ std::optional<StringAttr> kernelBlockSizeAttributeName = std::nullopt)
: ConvertOpToLLVMPattern<gpu::GPUFuncOp>(converter),
allocaAddrSpace(allocaAddrSpace),
workgroupAddrSpace(workgroupAddrSpace),
- kernelAttributeName(kernelAttributeName) {}
+ kernelAttributeName(kernelAttributeName),
+ kernelBlockSizeAttributeName(kernelBlockSizeAttributeName) {}
LogicalResult
matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
@@ -56,6 +58,9 @@ struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
/// The attribute name to use instead of `gpu.kernel`.
StringAttr kernelAttributeName;
+
+ /// The attribute name to to set block size
+ std::optional<StringAttr> kernelBlockSizeAttributeName;
};
/// The lowering of gpu.printf to a call to HIP hostcalls
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
index e60fe5cbd7603f..a7ac2332961ae2 100644
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -352,7 +352,9 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
/*workgroupAddrSpace=*/
static_cast<unsigned>(NVVM::NVVMMemorySpace::kSharedMemorySpace),
StringAttr::get(&converter.getContext(),
- NVVM::NVVMDialect::getKernelFuncAttrName()));
+ NVVM::NVVMDialect::getKernelFuncAttrName()),
+ StringAttr::get(&converter.getContext(),
+ NVVM::NVVMDialect::getMaxntidAttrName()));
populateOpPatterns<math::AbsFOp>(converter, patterns, "__nv_fabsf",
"__nv_fabs");
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
index 20a200e812c125..c7f1d4f124c186 100644
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -627,6 +627,15 @@ gpu.module @test_module_31 {
}
}
+gpu.module @gpumodule {
+// CHECK-LABEL: func @kernel_with_block_size()
+// CHECK: attributes {gpu.kernel, gpu.known_block_size = array<i32: 128, 1, 1>, nvvm.kernel, nvvm.maxntid = [128 : i32, 1 : i32, 1 : i32]}
+ gpu.func @kernel_with_block_size() kernel attributes {gpu.known_block_size = array<i32: 128, 1, 1>} {
+ gpu.return
+ }
+}
+
+
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%toplevel_module: !transform.any_op {transform.readonly}) {
%gpu_module = transform.structured.match ops{["gpu.module"]} in %toplevel_module
|
// If any of the dimensions are missing, fill them in with 1. | ||
attributes.emplace_back( | ||
kernelBlockSizeAttributeName.value(), | ||
rewriter.getI32ArrayAttr( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we use a DenseArray?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sure, I can use DenseArray and I also think I should.
Some adjustments are needed here, and some tests need to change. I can do it in a separate PR if that's okay.
…lvm#77301) Setting the thread block size with `maxntid` on the kernel has great performance benefits: it lets the downstream PTX compiler do better register allocation. MLIR's `gpu.launch` and `gpu.launch_func` already have an attribute (`known_block_size`) that keeps the thread block size when it is known. This PR simply uses this attribute to set `maxntid`.
@grypp Having been poking around all the relevant code ... shouldn't (By analogy, the ROCDL version uses |
Setting the thread block size with `maxntid` on the kernel has great performance benefits: it lets the downstream PTX compiler do better register allocation. MLIR's `gpu.launch` and `gpu.launch_func` already have an attribute (`known_block_size`) that keeps the thread block size when it is known. This PR simply uses this attribute to set `maxntid`.